from datetime import datetime, timedelta, UTC
from typing import Any, Dict, List, Optional, Tuple

import requests
import json
import html
import re
from langchain_core.language_models import BaseLLM
from loguru import logger

from ...utilities.search_utilities import remove_think_tags
from ..search_engine_base import BaseSearchEngine
from ...utilities.search_utilities import LANGUAGE_CODE_MAP

HEADERS = {
    "User-Agent": "local-deep-research-wikinews-search-engine (github.com/LearningCircuit/local-deep-research)"
}
WIKINEWS_LANGUAGES = [
    "ru",
    "sr",
    "pt",
    "fr",
    "pl",
    "en",
    "zh",
    "de",
    "it",
    "es",
    "cs",
    "nl",
    "ca",
    "ar",
    "ja",
]
TIMEOUT = 5  # Seconds
TIME_PERIOD_DELTAS = {
    "all": None,  # No time filter
    "y": timedelta(days=365),  # 1 year
    "m": timedelta(days=30),  # 1 month
    "w": timedelta(days=7),  # 1 week
    "d": timedelta(days=1),  # 24 hours
}
DEFAULT_RECENT_BACKWARD_DAYS = 60
MAX_RETRIES = 3


class WikinewsSearchEngine(BaseSearchEngine):
    """Wikinews search engine implementation with LLM query optimization"""

    # Mark as public and news search engine
    is_public = True
    is_news = True

    def __init__(
        self,
        search_language: str = "english",
        adaptive_search: bool = True,
        time_period: str = "y",
        llm: Optional[BaseLLM] = None,
        max_filtered_results: Optional[int] = None,
        max_results: int = 10,
        search_snippets_only: bool = True,
        **kwargs,
    ):
        """
        Initialize the Wikinews search engine.

        Args:
            search_language (str): Language for Wikinews search (e.g. "english").
            adaptive_search (bool): Whether to expand or shrink date ranges based on query.
            time_period (str): Defines the look-back window used to filter search results ("all", "y", "m", "w", "d").
            llm (Optional[BaseLLM]): Language model used for query optimization and classification.
            max_filtered_results (Optional[int]): Maximum number of results to keep after filtering.
            max_results (int): Maximum number of search results to return.
            search_snippets_only (bool): If True, full article content is ignored.
        """

        super().__init__(
            llm=llm,
            max_filtered_results=max_filtered_results,
            max_results=max_results,
            search_snippets_only=search_snippets_only,
            **kwargs,
        )

        # Language initialization
        lang_code = LANGUAGE_CODE_MAP.get(
            search_language.lower(),
            "en",  # Default to English if not found
        )

        if lang_code not in WIKINEWS_LANGUAGES:
            logger.warning(
                f"Wikinews does not support language '{search_language}' ({lang_code}). Defaulting to English."
            )
            lang_code = "en"

        self.lang_code: str = lang_code

        # Adaptive search
        self.adaptive_search: bool = adaptive_search

        # Date range initialization
        now = datetime.now(UTC)
        delta = TIME_PERIOD_DELTAS.get(time_period, timedelta(days=365))
        self.from_date: datetime = (
            now - delta if delta else datetime.min.replace(tzinfo=UTC)
        )
        self.to_date: datetime = now

        # Preserve original date range so adaptive search can restore it
        self._original_date_range = (self.from_date, self.to_date)

        # API base URL
        self.api_url: str = "https://{lang_code}.wikinews.org/w/api.php"

    def _optimize_query_for_wikinews(self, query: str) -> str:
        """
        Optimize a natural language query for Wikinews search.
        Uses LLM to transform questions into effective news search queries.

        Args:
            query (str): Natural language query

        Returns:
            Optimized search query for Wikinews
        """
        if not self.llm:
            return query

        try:
            # Prompt for query optimization
            prompt = f"""You are a query condenser. Your task is to transform the user’s natural-language question into a very short Wikinews search query.

Input question:
"{query}"

STRICT OUTPUT REQUIREMENTS (follow ALL of them):
1. Return ONLY a JSON object with EXACTLY one field: {{"query": "<refined_query>"}}.
2. The JSON must be valid, minified, and contain no trailing text.
3. The refined query must be extremely short: MAXIMUM 3–4 words.
4. Include only the essential keywords (proper names, events, entities, places).
5. Remove filler words (e.g., "news", "latest", "about", "what", "how", "is").
6. DO NOT add Boolean operators (AND, OR).
7. DO NOT use quotes inside the query.
8. DO NOT add explanations or comments.

EXAMPLES:
- "What's the impact of rising interest rates on UK housing market?" → {{"query": "UK housing rates"}}
- "Latest developments in the Ukraine-Russia peace negotiations" → {{"query": "Ukraine Russia negotiations"}}
- "How are tech companies responding to AI regulation?" → {{"query": "tech AI regulation"}}
- "What is Donald Trump's current political activity?" → {{"query": "Trump political activity"}}

NOW RETURN ONLY THE JSON OBJECT.
"""
            # Get response from LLM
            response = self.llm.invoke(prompt)

            response_text = (
                getattr(response, "content", None)
                or getattr(response, "text", None)
                or str(response)
            )

            # Find possible JSON object boundaries
            start = response_text.find("{")
            end = response_text.rfind("}")

            # Validate boundaries before slicing
            if start == -1 or end == -1 or end <= start:
                raise ValueError("No valid JSON boundaries found")

            json_str = response_text[start : end + 1]

            data = json.loads(json_str)

            if not isinstance(data, dict):
                raise ValueError("Extracted JSON is not an object")

            optimized_query = (data.get("query", "")).strip()

            if not optimized_query:
                raise ValueError("Query field missing or empty")

        except (
            ValueError,
            TypeError,
            AttributeError,
            json.JSONDecodeError,
        ) as e:
            logger.warning(
                f"Error optimizing query for Wikinews: {e}. Using original query."
            )
            return query

        logger.info(f"Original query: '{query}'")
        logger.info(f"Optimized for Wikinews: '{optimized_query}'")

        return optimized_query

    def _adapt_date_range_for_query(self, query: str) -> None:
        """
        Adapt the date range based on the query type (historical vs recent events).

        Args:
            query (str): The search query
        """
        # Reset to original date parameters first
        self.from_date, self.to_date = self._original_date_range

        if not self.adaptive_search or not self.llm:
            return

        # Do not adapt for very short queries (no enough context)
        if len(query.split()) <= 4:
            return

        try:
            prompt = f"""Classify this query based on temporal scope.

Query: "{query}"

Current date: {datetime.now(UTC).strftime("%Y-%m-%d")}
Cutoff: Events within the last {DEFAULT_RECENT_BACKWARD_DAYS} days are CURRENT

Classification rules:
- CURRENT: Recent events (last {DEFAULT_RECENT_BACKWARD_DAYS} days), ongoing situations, "latest", "recent", "today", "this week"
- HISTORICAL: Events before {DEFAULT_RECENT_BACKWARD_DAYS} days ago, timelines, chronologies, past tense ("what happened", "history of")
- UNCLEAR: Ambiguous temporal context

Respond with ONE WORD ONLY: CURRENT, HISTORICAL, or UNCLEAR"""
            # Get response from LLM
            response = self.llm.invoke(prompt)
            response_text = (
                getattr(response, "content", None)
                or getattr(response, "text", None)
                or str(response)
            )
            answer = remove_think_tags(response_text).upper()

            if "CURRENT" in answer:
                # For current events, focus on recent content
                logger.info(
                    f"Query '{query}' classified as CURRENT - focusing on recent content"
                )
                self.from_date = datetime.now(UTC) - timedelta(
                    days=DEFAULT_RECENT_BACKWARD_DAYS
                )
            elif "HISTORICAL" in answer:
                # For historical queries, go back as far as possible
                logger.info(
                    f"Query '{query}' classified as HISTORICAL - extending search timeframe"
                )
                self.from_date = datetime.min.replace(tzinfo=UTC)
            else:
                logger.info(
                    f"Query '{query}' classified as UNCLEAR - keeping original date range"
                )

        except (AttributeError, TypeError, ValueError, RuntimeError) as e:
            # Keep original date parameters on error
            logger.exception(
                f"Error adapting date range for query: {e}. Keeping original date range."
            )

    def _fetch_search_results(
        self, query: str, sroffset: int
    ) -> List[Dict[str, Any]]:
        """Fetch search results from Wikinews API.

        Args:
            query (str): The search query.
            sroffset (int): The result offset for pagination.

        Returns:
            List of search result items.
        """
        retries = 0
        while retries < MAX_RETRIES:
            params = {
                "action": "query",
                "list": "search",
                "srsearch": query,
                "srprop": "snippet|timestamp",
                "srlimit": 50,
                "sroffset": sroffset,
                "format": "json",
            }

            # Apply rate limiting before search request
            self._last_wait_time = self.rate_tracker.apply_rate_limit(
                self.engine_type
            )

            try:
                response = requests.get(
                    self.api_url.format(lang_code=self.lang_code),
                    params=params,
                    headers=HEADERS,
                    timeout=TIMEOUT,
                )
                response.raise_for_status()
                data = response.json()
                return data.get("query", {}).get("search", [])
            except (
                requests.exceptions.RequestException,
                json.JSONDecodeError,
            ) as e:
                logger.warning(
                    f"Error fetching search results: {e}, retrying..."
                )
                retries += 1

        return []

    def _process_search_result(
        self, result: Dict[str, Any], query: str
    ) -> Optional[Dict[str, Any]]:
        """Process and filter a single search result.

        Args:
            result (Dict[str, Any]): A single search result item.
            query (str): The search query.

        Returns:
            Processed result or None if filtered out.
        """
        page_id = result.get("pageid")
        title = result.get("title", "")
        snippet = _clean_wikinews_snippet(result.get("snippet", ""))

        try:
            last_edit_timestamp = result.get("timestamp", "")
            last_edit_date = datetime.fromisoformat(
                last_edit_timestamp.replace("Z", "+00:00")
            )
        except ValueError:
            logger.warning(
                f"Error parsing last edit date for page {page_id}, using current date as fallback."
            )
            last_edit_date = datetime.now(UTC)

        # First filter: last edit date must be after from_date
        if last_edit_date < self.from_date:
            # In this case we can skip fetching full content
            return None

        # Fetch full article content and extract actual publication date
        # Note: Wikinews API do not allow to retrieve publication date in batched search results
        full_content, publication_date = self._fetch_full_content_and_pubdate(
            page_id, last_edit_date
        )

        # Second filter: publication date within range
        if publication_date < self.from_date or publication_date > self.to_date:
            return None

        # Third filter: check if all query words are in title or content (naive approach)
        # Note: Wikinews search return false positive if query words are in "related" articles section
        combined_text = f"{title} {full_content}".lower()
        if not all(word.lower() in combined_text for word in query.split()):
            return None

        # If only snippets are requested, we use snippet as full content
        if self.search_snippets_only:
            full_content = snippet

        return {
            "id": page_id,
            "title": title,
            "snippet": snippet,
            "source": "wikinews",
            "url": f"https://{self.lang_code}.wikinews.org/?curid={page_id}",  # Used by '_filter_for_relevance' function
            "link": f"https://{self.lang_code}.wikinews.org/?curid={page_id}",  # Used by citation handler
            "content": full_content,
            "full_content": full_content,
            "publication_date": publication_date.isoformat(timespec="seconds"),
        }

    def _fetch_full_content_and_pubdate(
        self, page_id: int, fallback_date: datetime
    ) -> Tuple[str, datetime]:
        """Fetch full article content and publication date from Wikinews API.

        Args:
            page_id (int): The Wikinews page ID.
            fallback_date (datetime): Fallback date if publication date cannot be determined.

        Returns:
            Tuple of (full_content, publication_date)
        """
        try:
            content_params = {
                "action": "query",
                "prop": "revisions|extracts",
                "pageids": page_id,
                "rvprop": "timestamp",
                "rvdir": "newer",  # Older revisions first
                "rvlimit": 1,  # Get the first revision (i.e. publication)
                "explaintext": True,
                "format": "json",
            }

            # Apply rate limiting before content request
            self._last_wait_time = self.rate_tracker.apply_rate_limit(
                self.engine_type
            )

            content_resp = requests.get(
                self.api_url.format(lang_code=self.lang_code),
                params=content_params,
                headers=HEADERS,
                timeout=TIMEOUT,
            )
            content_resp.raise_for_status()
            content_data = content_resp.json()

            page_data = (
                content_data.get("query", {})
                .get("pages", {})
                .get(str(page_id), {})
            )
            full_content = page_data.get("extract", "")
            revisions = page_data.get("revisions", [])

            if revisions:
                try:
                    # First revision timestamp is the publication date
                    publication_date = datetime.fromisoformat(
                        revisions[0]["timestamp"].replace("Z", "+00:00")
                    )
                except ValueError:
                    logger.warning(
                        f"Error parsing publication date for page {page_id}, using fallback date."
                    )
                    publication_date = fallback_date
            else:
                logger.warning(
                    f"No revisions found for page {page_id}, using fallback date."
                )
                publication_date = fallback_date

            return full_content, publication_date

        except (
            requests.exceptions.RequestException,
            json.JSONDecodeError,
        ) as e:
            logger.warning(f"Error fetching content for page {page_id}: {e}")
            return "", fallback_date

    def _get_previews(self, query: str) -> List[Dict[str, Any]]:
        """
        Retrieve article previews from Wikinews based on the query.

        Args:
            query (str): The search query

        Returns:
            List of relevant article previews
        """
        # Adapt date range based on query and optimize query (if LLM is available)
        self._adapt_date_range_for_query(query)
        optimized_query = self._optimize_query_for_wikinews(query)

        articles = []
        sroffset = 0

        while len(articles) < self.max_results:
            search_results = self._fetch_search_results(
                optimized_query, sroffset
            )
            if not search_results:
                # No more results available (or multiple retries failed)
                break

            for result in search_results:
                article = self._process_search_result(result, optimized_query)
                if article:
                    articles.append(article)
                if len(articles) >= self.max_results:
                    break

            sroffset += len(search_results)

        return articles

    def _get_full_content(
        self, relevant_items: List[Dict[str, Any]]
    ) -> List[Dict[str, Any]]:
        """
        Retrieve full content for relevant Wikinews articles.

        Args:
            relevant_items (List[Dict[str, Any]]): List of relevant article previews

        Returns:
            List of articles with full content
        """
        # Since full content is already fetched in _get_previews, just return relevant items
        return relevant_items


def _clean_wikinews_snippet(snippet: str) -> str:
    """
    Clean a Wikinews search snippet.

    Args:
        snippet (str): Raw snippet from Wikinews API

    Returns:
        Clean human-readable text
    """
    if not snippet:
        return ""

    # Unescape HTML entities
    unescaped = html.unescape(snippet)

    # Remove HTML tags
    clean_text = re.sub(r"<.*?>", "", unescaped)

    # Normalize whitespace
    clean_text = re.sub(r"\s+", " ", clean_text).strip()

    return clean_text
